This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(tidyverse)
library(lubridate)
library(ggplot2)
getwd()
[1] "C:/Users/MinJae/Documents/Google Capstone - Cyclistic"
# Point the session at the project folder so that the relative "Datasets/..."
# paths passed to read_csv() below resolve.
# NOTE(review): this absolute path names one user's Documents folder, so the
# notebook will not run unchanged on another machine; consider opening the
# project from its own directory instead of calling setwd().
setwd("/Users/MinJae/Documents/Google Capstone - Cyclistic")
getwd()
[1] "C:/Users/MinJae/Documents/Google Capstone - Cyclistic"
m1_2021 <- read_csv("Datasets/202101-divvy-tripdata.csv", show_col_types = FALSE)
indexing 202101-divvy-tripdata.csv [=--------------------------------------------------------------] ?, eta: 0s
indexing 202101-divvy-tripdata.csv [======================================================] 276.45MB/s, eta: 0s
m2_2021 <- read_csv("Datasets/202102-divvy-tripdata.csv", show_col_types = FALSE)
indexing 202102-divvy-tripdata.csv [================----------------------------------------] 2.15GB/s, eta: 0s
indexing 202102-divvy-tripdata.csv [======================================================] 334.83MB/s, eta: 0s
m3_2021 <- read_csv("Datasets/202103-divvy-tripdata.csv", show_col_types = FALSE)
indexing 202103-divvy-tripdata.csv [====-----------------------------------------------------------] ?, eta: 0s
indexing 202103-divvy-tripdata.csv [======================================================] 294.72MB/s, eta: 0s
m4_2021 <- read_csv("Datasets/202104-divvy-tripdata.csv", show_col_types = FALSE)
indexing 202104-divvy-tripdata.csv [=--------------------------------------------------------------] ?, eta: 0s
indexing 202104-divvy-tripdata.csv [================================================------] 284.41MB/s, eta: 0s
indexing 202104-divvy-tripdata.csv [====================================================--] 283.80MB/s, eta: 0s
indexing 202104-divvy-tripdata.csv [======================================================] 282.84MB/s, eta: 0s
m5_2021 <- read_csv("Datasets/202105-divvy-tripdata.csv", show_col_types = FALSE)
indexing 202105-divvy-tripdata.csv [=--------------------------------------------------------------] ?, eta: 0s
indexing 202105-divvy-tripdata.csv [==================================--------------------] 293.10MB/s, eta: 0s
indexing 202105-divvy-tripdata.csv [=====================================-----------------] 294.91MB/s, eta: 0s
indexing 202105-divvy-tripdata.csv [=======================================---------------] 292.42MB/s, eta: 0s
indexing 202105-divvy-tripdata.csv [=========================================-------------] 290.23MB/s, eta: 0s
indexing 202105-divvy-tripdata.csv [===========================================-----------] 288.90MB/s, eta: 0s
indexing 202105-divvy-tripdata.csv [=============================================---------] 285.53MB/s, eta: 0s
indexing 202105-divvy-tripdata.csv [================================================------] 284.03MB/s, eta: 0s
indexing 202105-divvy-tripdata.csv [===================================================---] 291.88MB/s, eta: 0s
indexing 202105-divvy-tripdata.csv [=====================================================-] 283.11MB/s, eta: 0s
indexing 202105-divvy-tripdata.csv [======================================================] 282.35MB/s, eta: 0s
m6_2021 <- read_csv("Datasets/202106-divvy-tripdata.csv", show_col_types = FALSE)
indexing 202106-divvy-tripdata.csv [=--------------------------------------------------------------] ?, eta: 0s
indexing 202106-divvy-tripdata.csv [=======================-------------------------------] 286.49MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [=========================-----------------------------] 277.28MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [============================--------------------------] 267.97MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [==============================------------------------] 269.73MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [================================----------------------] 271.05MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [==================================--------------------] 274.69MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [====================================------------------] 274.09MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [======================================----------------] 277.50MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [=======================================---------------] 276.76MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [==========================================------------] 273.81MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [============================================----------] 275.21MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [==============================================--------] 277.96MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [=================================================-----] 275.85MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [===================================================---] 278.76MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [=====================================================-] 278.21MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [======================================================] 272.45MB/s, eta: 0s
indexing 202106-divvy-tripdata.csv [======================================================] 269.78MB/s, eta: 0s
m7_2021 <- read_csv("Datasets/202107-divvy-tripdata.csv", show_col_types = FALSE)
indexing 202107-divvy-tripdata.csv [=--------------------------------------------------------------] ?, eta: 0s
indexing 202107-divvy-tripdata.csv [=====================---------------------------------] 300.08MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [=======================-------------------------------] 299.77MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [=========================-----------------------------] 294.51MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [===========================---------------------------] 291.49MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [=============================-------------------------] 292.02MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [===============================-----------------------] 291.22MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [=================================---------------------] 289.79MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [===================================-------------------] 285.03MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [====================================------------------] 285.49MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [======================================----------------] 277.89MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [=========================================-------------] 282.34MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [===========================================-----------] 276.30MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [=============================================---------] 277.51MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [=================================================-----] 285.70MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [=================================================-----] 279.87MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [==================================================----] 274.86MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [====================================================--] 276.93MB/s, eta: 0s
indexing 202107-divvy-tripdata.csv [======================================================] 283.36MB/s, eta: 0s
m8_2021 <- read_csv("Datasets/202108-divvy-tripdata.csv", show_col_types = FALSE)
indexing 202108-divvy-tripdata.csv [=--------------------------------------------------------------] ?, eta: 0s
indexing 202108-divvy-tripdata.csv [=====================---------------------------------] 286.19MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [=======================-------------------------------] 293.45MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [=========================-----------------------------] 294.45MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [===========================---------------------------] 287.29MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [=============================-------------------------] 286.24MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [===============================-----------------------] 284.31MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [=================================---------------------] 287.70MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [====================================------------------] 292.23MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [=====================================-----------------] 287.89MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [======================================----------------] 283.42MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [==========================================------------] 289.79MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [==========================================------------] 282.55MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [============================================----------] 282.72MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [==============================================--------] 281.07MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [================================================------] 282.13MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [===================================================---] 283.04MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [=====================================================-] 286.16MB/s, eta: 0s
indexing 202108-divvy-tripdata.csv [======================================================] 291.99MB/s, eta: 0s
m9_2021 <- read_csv("Datasets/202109-divvy-tripdata.csv", show_col_types = FALSE)
indexing 202109-divvy-tripdata.csv [=--------------------------------------------------------------] ?, eta: 0s
indexing 202109-divvy-tripdata.csv [======================--------------------------------] 271.88MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [========================------------------------------] 271.82MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [==========================----------------------------] 272.38MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [=============================-------------------------] 277.58MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [==============================------------------------] 272.27MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [===============================-----------------------] 270.77MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [================================----------------------] 270.78MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [=================================---------------------] 265.66MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [====================================------------------] 270.97MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [=====================================-----------------] 270.97MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [=======================================---------------] 270.97MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [========================================--------------] 269.38MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [==========================================------------] 271.22MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [===========================================-----------] 269.22MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [=============================================---------] 269.29MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [==============================================--------] 272.56MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [================================================------] 269.45MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [==================================================----] 272.46MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [===================================================---] 269.55MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [=====================================================-] 270.21MB/s, eta: 0s
indexing 202109-divvy-tripdata.csv [======================================================] 270.73MB/s, eta: 0s
m10_2021 <- read_csv("Datasets/202110-divvy-tripdata.csv", show_col_types = FALSE)
indexing 202110-divvy-tripdata.csv [=--------------------------------------------------------------] ?, eta: 0s
indexing 202110-divvy-tripdata.csv [============================--------------------------] 288.87MB/s, eta: 0s
indexing 202110-divvy-tripdata.csv [==============================------------------------] 289.45MB/s, eta: 0s
indexing 202110-divvy-tripdata.csv [================================----------------------] 289.96MB/s, eta: 0s
indexing 202110-divvy-tripdata.csv [===================================-------------------] 289.27MB/s, eta: 0s
indexing 202110-divvy-tripdata.csv [=====================================-----------------] 289.71MB/s, eta: 0s
indexing 202110-divvy-tripdata.csv [=======================================---------------] 292.73MB/s, eta: 0s
indexing 202110-divvy-tripdata.csv [=========================================-------------] 285.34MB/s, eta: 0s
indexing 202110-divvy-tripdata.csv [===========================================-----------] 285.16MB/s, eta: 0s
indexing 202110-divvy-tripdata.csv [=============================================---------] 285.71MB/s, eta: 0s
indexing 202110-divvy-tripdata.csv [===============================================-------] 286.21MB/s, eta: 0s
indexing 202110-divvy-tripdata.csv [=================================================-----] 283.48MB/s, eta: 0s
indexing 202110-divvy-tripdata.csv [===================================================---] 282.98MB/s, eta: 0s
indexing 202110-divvy-tripdata.csv [======================================================] 291.41MB/s, eta: 0s
m11_2021 <- read_csv("Datasets/202111-divvy-tripdata.csv", show_col_types = FALSE)
indexing 202111-divvy-tripdata.csv [=--------------------------------------------------------------] ?, eta: 0s
indexing 202111-divvy-tripdata.csv [==============================================--------] 271.10MB/s, eta: 0s
indexing 202111-divvy-tripdata.csv [==================================================----] 272.63MB/s, eta: 0s
indexing 202111-divvy-tripdata.csv [=====================================================-] 268.46MB/s, eta: 0s
indexing 202111-divvy-tripdata.csv [======================================================] 269.49MB/s, eta: 0s
m12_2021 <- read_csv("Datasets/202112-divvy-tripdata.csv", show_col_types = FALSE)
indexing 202112-divvy-tripdata.csv [===------------------------------------------------------------] ?, eta: 0s
indexing 202112-divvy-tripdata.csv [======================================================] 293.49MB/s, eta: 0s
m1_2022 <- read_csv("Datasets/202201-divvy-tripdata.csv", show_col_types = FALSE)
indexing 202201-divvy-tripdata.csv [=--------------------------------------------------------------] ?, eta: 0s
indexing 202201-divvy-tripdata.csv [======================================================] 273.96MB/s, eta: 0s
colnames(m1_2021)
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "start_lat" "start_lng"
[11] "end_lat" "end_lng" "member_casual"
colnames(m2_2021)
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "start_lat" "start_lng"
[11] "end_lat" "end_lng" "member_casual"
colnames(m3_2021)
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "start_lat" "start_lng"
[11] "end_lat" "end_lng" "member_casual"
colnames(m4_2021)
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "start_lat" "start_lng"
[11] "end_lat" "end_lng" "member_casual"
colnames(m5_2021)
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "start_lat" "start_lng"
[11] "end_lat" "end_lng" "member_casual"
colnames(m6_2021)
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "start_lat" "start_lng"
[11] "end_lat" "end_lng" "member_casual"
colnames(m7_2021)
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "start_lat" "start_lng"
[11] "end_lat" "end_lng" "member_casual"
colnames(m8_2021)
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "start_lat" "start_lng"
[11] "end_lat" "end_lng" "member_casual"
colnames(m9_2021)
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "start_lat" "start_lng"
[11] "end_lat" "end_lng" "member_casual"
colnames(m10_2021)
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "start_lat" "start_lng"
[11] "end_lat" "end_lng" "member_casual"
colnames(m11_2021)
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "start_lat" "start_lng"
[11] "end_lat" "end_lng" "member_casual"
colnames(m12_2021)
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "start_lat" "start_lng"
[11] "end_lat" "end_lng" "member_casual"
colnames(m1_2022)
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "start_lat" "start_lng"
[11] "end_lat" "end_lng" "member_casual"
str(m1_2021)
spec_tbl_df [96,834 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ ride_id : chr [1:96834] "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
$ rideable_type : chr [1:96834] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
$ started_at : POSIXct[1:96834], format: "2021-01-23 16:14:19" "2021-01-27 18:43:08" "2021-01-21 22:35:54" "2021-01-07 13:31:13" ...
$ ended_at : POSIXct[1:96834], format: "2021-01-23 16:24:44" "2021-01-27 18:47:12" "2021-01-21 22:37:14" "2021-01-07 13:42:55" ...
$ start_station_name: chr [1:96834] "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
$ start_station_id : chr [1:96834] "17660" "17660" "17660" "17660" ...
$ end_station_name : chr [1:96834] NA NA NA NA ...
$ end_station_id : chr [1:96834] NA NA NA NA ...
$ start_lat : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
$ start_lng : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
$ end_lat : num [1:96834] 41.9 41.9 41.9 41.9 41.9 ...
$ end_lng : num [1:96834] -87.7 -87.7 -87.7 -87.7 -87.7 ...
$ member_casual : chr [1:96834] "member" "member" "member" "member" ...
- attr(*, "spec")=
.. cols(
.. ride_id = col_character(),
.. rideable_type = col_character(),
.. started_at = col_datetime(format = ""),
.. ended_at = col_datetime(format = ""),
.. start_station_name = col_character(),
.. start_station_id = col_character(),
.. end_station_name = col_character(),
.. end_station_id = col_character(),
.. start_lat = col_double(),
.. start_lng = col_double(),
.. end_lat = col_double(),
.. end_lng = col_double(),
.. member_casual = col_character()
.. )
- attr(*, "problems")=<externalptr>
str(m2_2021)
spec_tbl_df [49,622 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ ride_id : chr [1:49622] "89E7AA6C29227EFF" "0FEFDE2603568365" "E6159D746B2DBB91" "B32D3199F1C2E75B" ...
$ rideable_type : chr [1:49622] "classic_bike" "classic_bike" "electric_bike" "classic_bike" ...
$ started_at : POSIXct[1:49622], format: "2021-02-12 16:14:56" "2021-02-14 17:52:38" "2021-02-09 19:10:18" "2021-02-02 17:49:41" ...
$ ended_at : POSIXct[1:49622], format: "2021-02-12 16:21:43" "2021-02-14 18:12:09" "2021-02-09 19:19:10" "2021-02-02 17:54:06" ...
$ start_station_name: chr [1:49622] "Glenwood Ave & Touhy Ave" "Glenwood Ave & Touhy Ave" "Clark St & Lake St" "Wood St & Chicago Ave" ...
$ start_station_id : chr [1:49622] "525" "525" "KA1503000012" "637" ...
$ end_station_name : chr [1:49622] "Sheridan Rd & Columbia Ave" "Bosworth Ave & Howard St" "State St & Randolph St" "Honore St & Division St" ...
$ end_station_id : chr [1:49622] "660" "16806" "TA1305000029" "TA1305000034" ...
$ start_lat : num [1:49622] 42 42 41.9 41.9 41.8 ...
$ start_lng : num [1:49622] -87.7 -87.7 -87.6 -87.7 -87.6 ...
$ end_lat : num [1:49622] 42 42 41.9 41.9 41.8 ...
$ end_lng : num [1:49622] -87.7 -87.7 -87.6 -87.7 -87.6 ...
$ member_casual : chr [1:49622] "member" "casual" "member" "member" ...
- attr(*, "spec")=
.. cols(
.. ride_id = col_character(),
.. rideable_type = col_character(),
.. started_at = col_datetime(format = ""),
.. ended_at = col_datetime(format = ""),
.. start_station_name = col_character(),
.. start_station_id = col_character(),
.. end_station_name = col_character(),
.. end_station_id = col_character(),
.. start_lat = col_double(),
.. start_lng = col_double(),
.. end_lat = col_double(),
.. end_lng = col_double(),
.. member_casual = col_character()
.. )
- attr(*, "problems")=<externalptr>
# Stack the thirteen monthly tables (2021-01 through 2022-01) into one data
# frame and drop the four coordinate columns in the same pipeline.
all_trips <- bind_rows(
  m1_2021, m2_2021, m3_2021, m4_2021, m5_2021, m6_2021,
  m7_2021, m8_2021, m9_2021, m10_2021, m11_2021, m12_2021,
  m1_2022
) %>%
  select(-start_lat, -start_lng, -end_lat, -end_lng)
colnames(all_trips) #List of column names
[1] "ride_id" "rideable_type" "started_at" "ended_at" "start_station_name"
[6] "start_station_id" "end_station_name" "end_station_id" "member_casual"
nrow(all_trips) #How many rows are in data frame?
[1] 5698833
dim(all_trips) #Dimensions of the data frame?
[1] 5698833 9
head(all_trips) #See the first 6 rows of data frame. Also tail(all_trips)
str(all_trips) #See list of columns and data types (numeric, character, etc)
tibble [5,698,833 x 9] (S3: tbl_df/tbl/data.frame)
$ ride_id : chr [1:5698833] "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
$ rideable_type : chr [1:5698833] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
$ started_at : POSIXct[1:5698833], format: "2021-01-23 16:14:19" "2021-01-27 18:43:08" "2021-01-21 22:35:54" "2021-01-07 13:31:13" ...
$ ended_at : POSIXct[1:5698833], format: "2021-01-23 16:24:44" "2021-01-27 18:47:12" "2021-01-21 22:37:14" "2021-01-07 13:42:55" ...
$ start_station_name: chr [1:5698833] "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
$ start_station_id : chr [1:5698833] "17660" "17660" "17660" "17660" ...
$ end_station_name : chr [1:5698833] NA NA NA NA ...
$ end_station_id : chr [1:5698833] NA NA NA NA ...
$ member_casual : chr [1:5698833] "member" "member" "member" "member" ...
summary(all_trips) #Statistical summary of data. Mainly for numerics
ride_id rideable_type started_at ended_at
Length:5698833 Length:5698833 Min. :2021-01-01 00:02:05 Min. :2021-01-01 00:08:39
Class :character Class :character 1st Qu.:2021-06-08 14:51:48 1st Qu.:2021-06-08 15:16:41
Mode :character Mode :character Median :2021-08-03 01:18:38 Median :2021-08-03 02:12:44
Mean :2021-08-01 10:30:05 Mean :2021-08-01 10:51:53
3rd Qu.:2021-09-27 15:07:19 3rd Qu.:2021-09-27 15:27:45
Max. :2022-01-31 23:58:37 Max. :2022-02-01 01:46:16
start_station_name start_station_id end_station_name end_station_id member_casual
Length:5698833 Length:5698833 Length:5698833 Length:5698833 Length:5698833
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
table(all_trips$member_casual)
casual member
2547525 3151308
# Normalise rider-type labels: any "Subscriber" becomes "member" and any
# "Customer" becomes "casual"; every other value is left untouched.
all_trips$member_casual <- recode(
  all_trips$member_casual,
  "Subscriber" = "member",
  "Customer" = "casual"
)
table(all_trips$member_casual)
casual member
2547525 3151308
# Derive calendar fields from each ride's start timestamp so rides can be
# aggregated by day, month, year, or weekday.
all_trips$date <- as.Date(all_trips$started_at) # The default format is yyyy-mm-dd
# `date` is already a Date, so it is formatted directly (no second as.Date()).
all_trips$month <- format(all_trips$date, "%m")
all_trips$day <- format(all_trips$date, "%d")
all_trips$year <- format(all_trips$date, "%Y")
# "%A" is the full weekday name in the session's current locale.
all_trips$day_of_week <- format(all_trips$date, "%A")
# Ride duration. The unit is fixed to seconds explicitly: with the default
# units = "auto", difftime() picks the unit from the data, while the later
# numeric conversion and summaries treat ride_length as seconds.
all_trips$ride_length <- difftime(
  all_trips$ended_at,
  all_trips$started_at,
  units = "secs"
)
str(all_trips)
tibble [5,698,833 x 15] (S3: tbl_df/tbl/data.frame)
$ ride_id : chr [1:5698833] "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
$ rideable_type : chr [1:5698833] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
$ started_at : POSIXct[1:5698833], format: "2021-01-23 16:14:19" "2021-01-27 18:43:08" "2021-01-21 22:35:54" "2021-01-07 13:31:13" ...
$ ended_at : POSIXct[1:5698833], format: "2021-01-23 16:24:44" "2021-01-27 18:47:12" "2021-01-21 22:37:14" "2021-01-07 13:42:55" ...
$ start_station_name: chr [1:5698833] "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
$ start_station_id : chr [1:5698833] "17660" "17660" "17660" "17660" ...
$ end_station_name : chr [1:5698833] NA NA NA NA ...
$ end_station_id : chr [1:5698833] NA NA NA NA ...
$ member_casual : chr [1:5698833] "member" "member" "member" "member" ...
$ date : Date[1:5698833], format: "2021-01-23" "2021-01-27" "2021-01-21" "2021-01-07" ...
$ month : chr [1:5698833] "01" "01" "01" "01" ...
$ day : chr [1:5698833] "23" "27" "21" "07" ...
$ year : chr [1:5698833] "2021" "2021" "2021" "2021" ...
$ day_of_week : chr [1:5698833] "Saturday" "Wednesday" "Thursday" "Thursday" ...
$ ride_length : 'difftime' num [1:5698833] 625 244 80 702 ...
..- attr(*, "units")= chr "secs"
is.factor(all_trips$ride_length)
[1] FALSE
# Convert ride_length from difftime to a plain numeric number of seconds so
# that mean/median/max and aggregate() can be applied to it. Asking for
# units = "secs" converts directly (no detour through character strings) and
# yields seconds whatever unit the difftime currently carries.
all_trips$ride_length <- as.numeric(all_trips$ride_length, units = "secs")
is.numeric(all_trips$ride_length)
[1] TRUE
# Eyeball the first 50 rows, including the newly derived columns.
head(all_trips, 50)

# Set aside rides with a zero or negative duration for inspection.
all_trips_ex1 <- filter(all_trips, ride_length <= 0)
head(all_trips_ex1)

# Drop rides that start at "HQ QR" and rides with a negative duration.
# The test must be NA-safe: start_station_name is missing for many rides, and
# `start_station_name == "HQ QR"` is NA for those. Indexing with that NA mask
# (`all_trips[!(...), ]`) returned an all-NA row for each of them, which is
# where the all-NA rows counted later in this notebook came from. `%in%` never
# returns NA, so rides without a station name are now kept intact.
# NOTE(review): outputs recorded further down (NA counts, row totals, summary
# statistics) were produced before this fix and will change on re-run.
all_trips_v2 <- all_trips %>%
  filter(
    !(start_station_name %in% "HQ QR"),
    is.na(ride_length) | ride_length >= 0
  )

# The station columns are not needed for the remaining analysis.
all_trips_v3 <- subset(
  all_trips_v2,
  select = -c(start_station_name, start_station_id, end_station_name, end_station_id)
)
which(is.na(all_trips_v3$ride_length))
[1] 10166 10167 10169 10170 10171 10172 10986 10990 10995 10996 10998 11005 11371 11376 11377 11384 11385
[18] 11386 12475 12479 12853 12856 12941 12942 12943 12950 12952 12954 12957 12958 12959 12960 12961 12963
[35] 13020 13093 13259 13260 13261 13262 13265 13266 13267 13268 13271 13272 13558 13560 13561 13562 13563
[52] 13564 13567 13570 13572 13923 13924 13925 13933 13934 13935 13936 13937 13939 13940 13941 13943 13945
[69] 13947 14186 14187 14188 14189 14193 14251 14252 14253 14254 14255 15236 15237 15238 15248 15249 15250
[86] 15251 15252 15253 15254 15255 15256 15257 15258 15259 15260 15261 15265 15267 15878 15879 15880 15881
[103] 15882 15883 15884 15885 15886 15887 16139 16141 16142 16143 16144 16145 16227 16228 16232 16244 16248
[120] 16249 16250 16251 16496 16497 16498 16499 16500 16525 16526 16527 16529 16542 16544 16547 16548 16606
[137] 16607 16608 16609 16610 16641 16642 16643 16644 16823 16825 16852 16853 16854 16857 16859 16922 16924
[154] 16925 16926 16927 16928 16929 16930 16931 16932 16933 17028 17096 17097 17099 17100 17102 17103 17105
[171] 17106 17117 17118 17119 17120 17122 17124 17125 17126 17127 17130 17131 17132 17134 17135 17136 17137
[188] 17346 17347 17350 17351 17352 17353 17405 17820 17821 17823 17824 17825 17826 17828 17829 17830 17831
[205] 17832 18744 18746 18747 18748 18752 18753 18754 18755 18756 18757 18758 18759 18760 18762 19485 19486
[222] 19487 19490 19491 19492 19493 19494 19495 19545 19546 19547 19550 19552 19555 19556 19563 19564 19565
[239] 19568 19569 19570 19620 19622 19623 19624 19625 19626 19627 19628 19631 19633 19634 19635 19636 19637
[256] 19700 19701 19702 19703 19704 19705 19708 19709 19806 19807 19808 19823 19826 20982 20984 20986 20987
[273] 20993 20994 20995 20997 20998 20999 21000 21003 21004 21006 21007 21009 21011 21012 21015 21017 21018
[290] 21019 21020 21268 21269 21270 21273 21274 21275 22007 22008 22010 22011 22012 22013 22015 22016 22017
[307] 22054 22055 22056 22057 22058 22060 22061 22062 22063 22064 22065 22076 22077 22078 22079 22080 22081
[324] 22082 22083 22084 22085 22086 22087 22562 22564 22565 22566 22567 22568 22569 22570 22574 22575 22577
[341] 22610 22611 22612 22613 22614 22630 22634 22636 22637 22638 22642 22644 22645 22672 22673 22743 22752
[358] 22776 23022 23024 23027 23028 23123 23125 23126 23128 23138 23140 23141 23142 23143 23144 23145 23146
[375] 23821 23823 23824 23827 23829 23830 23831 23832 23833 23834 23835 23837 23838 23839 23842 23845 23953
[392] 23954 23955 23956 23957 23959 23961 23965 23966 23967 23968 23973 23974 23975 24105 24106 24107 24108
[409] 24173 24174 24175 24176 24177 24611 24612 24613 24614 24615 24616 24617 24618 24619 24620 24621 24622
[426] 24626 24627 24629 24630 24631 25148 25149 25150 25151 25152 25153 25154 25155 25159 25160 25161 25162
[443] 25165 25679 25682 25902 25904 25905 25906 25907 25908 25909 25910 25911 25912 25913 25914 25917 25918
[460] 25919 25922 25923 25994 25996 25998 25999 26005 26006 26008 26009 26010 26011 26012 26013 26016 26017
[477] 26018 26019 26021 27408 27412 27417 27418 27419 27420 27422 27423 27427 27430 27497 27503 27504 27506
[494] 27507 27509 27510 27517 27518 27519 27535 27536 27537 27547 27548 27549 27550 27556 28026 28030 28031
[511] 28032 28034 28035 28036 28037 28038 28133 28134 28135 28136 28137 28138 28140 28151 28152 28284 28285
[528] 28286 28289 28290 28291 28292 28293 28297 28698 28699 28700 28701 28703 28705 28710 28711 28714 28715
[545] 28716 28718 28724 28727 28803 28805 28807 28899 28902 28903 28904 28907 28910 29536 29538 29539 29540
[562] 29542 29544 29547 29778 29779 29782 29826 29827 29828 29830 29831 29832 29833 29834 29897 29898 29901
[579] 29902 30020 30021 30022 30024 30025 30026 30027 30028 30233 30234 30235 30236 30237 30238 30239 30240
[596] 30242 30243 30248 30250 30252 30256 30257 30258 30259 30286 30287 30289 30290 30291 30292 30293 30294
[613] 30296 30298 30409 30410 30412 30628 30633 30634 30644 30651 30655 30657 30662 30778 30862 30864 30867
[630] 30871 30873 30874 30877 30881 30882 30883 30887 30888 30889 30890 30891 30892 30895 30909 30918 30919
[647] 30922 30924 30925 30926 30927 30928 30929 30937 30938 30989 30990 30997 30998 30999 31000 31001 31178
[664] 31180 31181 31182 31183 31184 31185 31187 31189 31190 31191 31193 31195 31196 31210 31211 31212 31213
[681] 31214 31215 31216 31218 31221 31222 31223 31225 31232 31233 31234 31235 31237 31239 31240 31241 31242
[698] 31243 31245 31246 31247 31248 31289 31295 31405 31408 31412 31673 31674 31675 31676 31677 31679 31682
[715] 31684 31685 31735 31736 31737 31738 31739 31824 31826 31831 31832 31836 31837 32169 32171 32172 32173
[732] 32358 32359 32360 32361 32363 32365 32366 32371 32373 32376 32382 32383 32483 32484 32485 32486 32487
[749] 32488 32489 32490 32491 32492 33311 33313 33328 33329 33330 33332 33335 33336 33337 33338 33339 33340
[766] 33342 33344 33345 33443 33445 33446 33454 33458 33461 33462 33463 33469 33472 33475 33480 33481 33482
[783] 33483 33484 33485 33486 33487 33488 33489 33490 33491 33601 33602 33603 33605 33606 33607 33608 33609
[800] 33610 33611 33612 33613 33614 33627 33628 33629 33630 33631 33633 33634 33645 33646 33648 33653 33654
[817] 33656 33662 33663 33680 33681 33682 33704 33750 33886 33888 33890 33892 33893 33896 33898 33903 33904
[834] 33908 33909 33910 33936 33937 33938 33939 33940 33941 33950 33952 34064 34065 36616 36617 36618 36619
[851] 36624 36625 36630 36687 36688 36689 36692 36693 36694 36695 36696 36697 36698 36699 36700 36703 36704
[868] 36705 36706 36707 36708 36709 36711 36712 36713 36714 36737 36738 36769 36770 36771 36772 36773 36774
[885] 36775 36776 36777 36778 36779 36780 36781 36782 36783 36784 36785 36786 36787 36788 36789 36790 36791
[902] 36792 36793 36794 36795 36798 36799 36800 36802 36803 36804 36805 36806 36807 36808 36809 36810 36811
[919] 36812 36814 36815 36816 36817 36818 36819 36820 36821 36822 36823 36824 36825 36826 36828 36829 36830
[936] 36831 36832 36833 36834 36835 36836 36837 36838 36839 36840 36841 36842 36843 36844 36845 36846 36847
[953] 36848 36849 36850 36851 36852 36853 36854 36855 36856 36857 36858 36859 36860 36861 36862 36863 36864
[970] 36865 36866 36867 36868 36869 36870 36871 36872 36873 36874 36875 36876 36877 36878 36879 36880 36881
[987] 36882 36883 36884 36885 36886 36887 36888 36889 36890 36891 36892 36893 36894 36895
[ reached getOption("max.print") -- omitted 706049 entries ]
# How many ride_length values in all_trips_v3 are missing?
sum(is.na(all_trips_v3[["ride_length"]]))
[1] 707049
# Count the NA values in every column of all_trips_v3.
# vapply() pins the result to exactly one integer per column, whereas
# sapply() picks its return shape from whatever the input happens to be.
vapply(all_trips_v3, function(x) sum(is.na(x)), integer(1))
ride_id rideable_type started_at ended_at member_casual date month day
707049 707049 707049 707049 707049 707049 707049 707049
year day_of_week ride_length
707049 707049 707049
# Keep only the rows of all_trips_v3 that contain no NA at all and store them
# as all_trips_v4; then preview the first 50 rows and count what remains.
all_trips_v4 <- na.omit(all_trips_v3)
head(all_trips_v4, n = 50)
nrow(all_trips_v4)
[1] 4991637
# Number of rows and number of columns of the cleaned data
c(nrow(all_trips_v4), ncol(all_trips_v4))
[1] 4991637 11
# Descriptive analysis on ride_length (all figures in seconds)
with(all_trips_v4, mean(ride_length)) # arithmetic mean: total ride length / number of rides
[1] 1361.479
with(all_trips_v4, median(ride_length)) # middle value of the sorted ride lengths
[1] 726
with(all_trips_v4, max(ride_length)) # longest ride
[1] 3356649
with(all_trips_v4, min(ride_length)) # shortest ride
[1] 0
# Min, quartiles, mean and max of ride_length in a single call
with(all_trips_v4, summary(ride_length))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 412 726 1361 1318 3356649
# Compare members and casual riders on ride_length (seconds).
# The data frame is passed through `data =` so that the result columns are
# named member_casual / ride_length rather than `all_trips_v4$member_casual`
# and `all_trips_v4$ride_length`.
aggregate(ride_length ~ member_casual, data = all_trips_v4, FUN = mean)
aggregate(ride_length ~ member_casual, data = all_trips_v4, FUN = median)
aggregate(ride_length ~ member_casual, data = all_trips_v4, FUN = max)
aggregate(ride_length ~ member_casual, data = all_trips_v4, FUN = min)
# Average ride time for each day of the week, members vs casual riders.
# `data =` keeps the result columns named member_casual / day_of_week /
# ride_length instead of `all_trips_v4$...`.
aggregate(ride_length ~ member_casual + day_of_week, data = all_trips_v4, FUN = mean)
# Turn day_of_week into an ordered factor running Sunday..Saturday so the days
# sort in calendar order
all_trips_v4$day_of_week <- ordered(
  all_trips_v4$day_of_week,
  levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
)
# Re-run the same averages now that the days are in calendar order
aggregate(ride_length ~ member_casual + day_of_week, data = all_trips_v4, FUN = mean)
# analyze ridership data by type and weekday
# Ride count and mean ride length for every rider type / weekday combination
all_trips_v4 %>%
  group_by(
    member_casual,
    weekday = wday(started_at, label = TRUE) # weekday label derived with wday()
  ) %>%
  summarise(
    number_of_rides = n(),               # rides in each group
    average_duration = mean(ride_length) # mean ride length in each group
  ) %>%
  arrange(member_casual, weekday)        # sort by rider type, then weekday
`summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
# Ride count and mean ride length for every rider type / month combination
all_trips_v4 %>%
  group_by(member_casual, month) %>%
  summarise(
    number_of_rides = n(),               # rides in each group
    average_duration = mean(ride_length) # mean ride length in each group
  ) %>%
  arrange(member_casual, month)          # sort by rider type, then month
`summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
# Bar chart: number of rides per month, one bar per rider type side by side
all_trips_v4 %>%
  group_by(member_casual, month) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, month) %>%
  ggplot(mapping = aes(x = month, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge")
`summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
# Bar chart: average ride duration per month, one bar per rider type.
# NOTE: the notebook captured the console warnings below in the middle of this
# pipeline. They are output, not code, and are kept here only as a record:
#   Warning messages:
#   1: In readChar(file, size, TRUE) : truncating string with embedded nuls
#   2: In readChar(file, size, TRUE) : truncating string with embedded nuls
#   3: In readChar(file, size, TRUE) : truncating string with embedded nuls
all_trips_v4 %>%
  group_by(member_casual, month) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, month) %>%
  ggplot(aes(x = month, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge")
`summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
# Bar chart: number of rides per weekday, one bar per rider type side by side
all_trips_v4 %>%
  group_by(
    member_casual,
    weekday = wday(started_at, label = TRUE)
  ) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(mapping = aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge")
`summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
# Bar chart: average ride duration per weekday, one bar per rider type
all_trips_v4 %>%
  group_by(
    member_casual,
    weekday = wday(started_at, label = TRUE)
  ) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length)
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(mapping = aes(x = weekday, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge")
`summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
# Export a summary file: mean ride_length by rider type and day of week.
# The summary is built from all_trips_v4 -- the NA-free table with ordered
# weekdays that the rest of the analysis uses -- rather than the earlier
# all_trips_v2. Passing it through `data =` gives the CSV plain column headers
# (member_casual, day_of_week, ride_length).
counts <- aggregate(ride_length ~ member_casual + day_of_week, data = all_trips_v4, FUN = mean)
write.csv(counts, file = '/Users/MinJae/Documents/Google Capstone - Cyclistic/avg_ride_length.csv')
# Export the cleaned all_trips_v4 table for further analysis and visualization
# in Tableau. The file name carries a .csv extension, like the summary export,
# so that it is recognised as a CSV file.
write.csv(all_trips_v4, file = "/Users/MinJae/Documents/Google Capstone - Cyclistic/all_trips.csv")
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.